Install any external packages
!pip install altair
!pip install vega_datasets
Import utilized packages
import numpy as np
import os
import pandas as pd
import altair as alt
import matplotlib.pyplot as plt
import seaborn as sns
from vega_datasets import data
data
Import and clean pre 2021 data
# Load the pre-2021 report and remove the two affect columns.
world_data = pd.read_csv('world-happiness-report.csv')
# DataFrame.drop returns a new frame, so the raw `world_data` is left untouched.
df = world_data.drop(columns=['Positive affect', 'Negative affect'])
df
Import and clean 2021 data
# Load the 2021 report, tag every row with its year, remove the listed
# columns, and rename the remaining ones to the names the rest of the
# notebook uses ('Life Ladder', 'Log GDP per capita', ...).
recent_data = pd.read_csv('world-happiness-report-2021.csv')
df2 = (
    recent_data
    .assign(year=2021)
    .drop(columns=[
        'Standard error of ladder score',
        'upperwhisker',
        'lowerwhisker',
        'Ladder score in Dystopia',
        'Explained by: Log GDP per capita',
        'Explained by: Social support',
        'Explained by: Healthy life expectancy',
        'Explained by: Freedom to make life choices',
        'Explained by: Generosity',
        'Explained by: Perceptions of corruption',
        'Dystopia + residual',
    ])
    .rename(columns={
        'Ladder score': 'Life Ladder',
        'Logged GDP per capita': 'Log GDP per capita',
        'Healthy life expectancy': 'Healthy life expectancy at birth',
    })
)
df2
Add the countries that are missing from the 2021 data to their regions, then add a 'Regional indicator' column to the pre-2021 data
# Region name -> list of the country names that the 2021 data files under it.
dic = {
    region: list(
        df2[df2['Regional indicator'] == region]
        .groupby('Country name')
        .size()
        .index
    )
    for region in df2['Regional indicator'].unique()
}

# Countries without a 2021 row, assigned to a region by hand. The order is
# kept so each region's list ends up in the same order as before.
for region, country in [
    ('Sub-Saharan Africa', 'Angola'),
    ('Latin America and Caribbean', 'Belize'),
    ('South Asia', 'Bhutan'),
    ('Sub-Saharan Africa', 'Central African Republic'),
    ('Sub-Saharan Africa', 'Congo (Kinshasa)'),
    ('Latin America and Caribbean', 'Cuba'),
    ('Sub-Saharan Africa', 'Djibouti'),
    ('Latin America and Caribbean', 'Guyana'),
    ('Middle East and North Africa', 'Oman'),
    ('Middle East and North Africa', 'Qatar'),
    ('Sub-Saharan Africa', 'Somalia'),
    ('Sub-Saharan Africa', 'Somaliland region'),
    ('Sub-Saharan Africa', 'South Sudan'),
    ('Middle East and North Africa', 'Sudan'),
    ('Latin America and Caribbean', 'Suriname'),
    ('Middle East and North Africa', 'Syria'),
    ('Latin America and Caribbean', 'Trinidad and Tobago'),
]:
    dic[region].append(country)
def find_region(x):
    """Return the region whose country list in ``dic`` contains ``x``.

    Regions are checked in the dictionary's own order and the first match
    wins. ``None`` is returned when no region lists the country.
    """
    return next(
        (region for region, members in dic.items() if x in members),
        None,
    )
# Look up the region of every pre-2021 row, one country name at a time.
# Countries that find_region cannot place receive a missing value.
df['Regional indicator'] = df['Country name'].map(find_region)
df
Combines all data into one dataframe
# Stack the pre-2021 rows on top of the 2021 rows; ignore_index renumbers the
# result 0..n-1 so the two source indexes do not collide.
cmbd_df = pd.concat([df, df2], ignore_index=True)
cmbd_df
Describes all data with basic statistics
This was used so we had a basic understanding of our quantitative variables
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
cmbd_df.describe()
Return Nulls per column
# axis=0 sums down the rows, giving one missing-value count per column.
cmbd_df.isnull().sum(axis = 0)
Return Nulls per column of 2021 data
Was useful to find that no Null data came from 2021
# Same per-column missing-value count, restricted to the 2021 rows.
df2.isnull().sum(axis = 0)
Creates a new DataFrame aggregated by year and Regional indicator
# Average every numeric column per (year, region).
# numeric_only=True keeps the string column 'Country name' out of the mean;
# pandas >= 2.0 raises a TypeError instead of silently dropping it.
ag_year = cmbd_df.groupby(['year', 'Regional indicator']).mean(numeric_only=True)
# Each call moves the current outermost index level into a column: first
# 'year', then 'Regional indicator'. The frame ends up with a plain integer
# index and the columns ordered 'Regional indicator', 'year', <means...>.
ag_year.reset_index(level=0, inplace=True)
ag_year.reset_index(level=0, inplace=True)
ag_year
For our project, we base our analysis on the assumption that a country's Life Ladder score (a value from 0 to 10, where 0 represents the worst possible life and 10 the best possible life) is an accurate measure of that country's happiness. Given this assumption, we begin our visualization-based analysis by charting the average Life Ladder score by region, looking for patterns in which parts of the world tend to have higher Life Ladder scores and which parts tend to have lower ones.
Here we have some interesting finds right off the bat: Sub-Saharan Africa and South Asia have by far the lowest average life ladder scores across the board, while North America and ANZ and Western Europe have the highest average life ladder scores by relatively significant margins. What we find very interesting is that the bottom two and the top two regions have very similar or identical scores from 2019 on.
We decided to do some further visualizations on the Life Ladder Score to show the data from multiple perspectives
# Bar chart of the mean Life Ladder score per region; sort='-y' orders the
# bars from the highest mean to the lowest.
ag_region = (
    alt.Chart(cmbd_df, title='Life Ladder by Region')
    .mark_bar()
    .encode(
        x=alt.X('Regional indicator:N', sort='-y'),
        y='mean_score:Q',
        tooltip=['Regional indicator', 'mean_score:Q'],
    )
    .transform_aggregate(
        mean_score='mean(Life Ladder)',
        groupby=['Regional indicator'],
    )
)
ag_region
# Line chart of the mean Life Ladder score over all rows of each year.
# NOTE: this rebinds `ag_year` to an Altair chart.
ag_year = (
    alt.Chart(cmbd_df)
    .mark_line()
    .encode(x='year:O', y='mean_score:Q')
    .transform_aggregate(
        mean_score='mean(Life Ladder)',
        groupby=['year'],
    )
    .properties(title='Average global Life Ladder Score Over the Years')
)
ag_year
The graph above shows the average global life ladder score. What stands out is that a large dip occurred in 2006, and a noticeable dip occurred with the start of the pandemic.
def graph_stat(x):
    """Build a region selector beside a per-region yearly line chart of ``x``.

    ``x`` is the name of a ``cmbd_df`` column. The right-hand chart plots the
    mean of that column for every (year, region) pair. Both charts share one
    selection on 'Regional indicator': marks outside the current selection
    are drawn in light gray.

    Returns the two charts concatenated horizontally.
    """
    region_select = alt.selection_multi(fields=['Regional indicator'], name="Select")

    # Left panel: one rectangle per region, acting as the clickable legend.
    make_selector = (
        alt.Chart(cmbd_df, title='Select Region')
        .mark_rect()
        .encode(
            y='Regional indicator',
            color=alt.condition(
                region_select, 'Regional indicator', alt.value('lightgray')
            ),
        )
        .add_selection(region_select)
    )

    # Right panel: yearly mean of the requested column, one line per region.
    ag_year_and_region = (
        alt.Chart(cmbd_df, title=f'{x} by Region')
        .mark_line()
        .encode(
            x='year:O',
            y='mean:Q',
            color=alt.condition(
                region_select, 'Regional indicator', alt.value('lightgray')
            ),
            tooltip=['Regional indicator', 'year'],
        )
        .transform_aggregate(
            mean=f'mean({x})',
            groupby=['year', 'Regional indicator'],
        )
    )

    return make_selector | ag_year_and_region
# Heatmap: one cell per (year, region), colored by that cell's mean Life Ladder.
(
    alt.Chart(cmbd_df)
    .mark_rect()
    .encode(
        x='year:O',
        y='Regional indicator:N',
        color='mean_Life_Ladder:Q',
    )
    .transform_aggregate(
        mean_Life_Ladder='mean(Life Ladder)',
        groupby=['year', 'Regional indicator'],
    )
)
# Interactive per-region trend of the same score.
graph_stat('Life Ladder')
# Spelling found in the country-code file -> spelling returned by clean_names.
_COUNTRY_NAME_FIXES = {
    'Bolivia (Plurinational State of)': 'Bolivia',
    'Congo': 'Congo (Brazzaville)',
    'Congo, Democratic Republic of the': 'Congo (Kinshasa)',
    'Czechia': 'Czech Republic',
    'Iran (Islamic Republic of)': 'Iran',
    'Moldova, Republic of': 'Moldova',
    'Russian Federation': 'Russia',
    'Korea, Republic of': 'South Korea',
    'Syrian Arab Republic': 'Syria',
    'Tanzania, United Republic of': 'Tanzania',
    'United Kingdom of Great Britain and Northern Ireland': 'United Kingdom',
    'USA': 'United States',
    'Venezuela (Bolivarian Republic of)': 'Venezuela',
    'Viet Nam': 'Vietnam',
    "Côte d'Ivoire": 'Ivory Coast',
    "Lao People's Democratic Republic": 'Laos',
    'Eswatini': 'Swaziland',
}


def clean_names(x):
    """Return the replacement spelling for country name ``x``.

    Names listed in the lookup table are replaced; any other value is
    returned unchanged.
    """
    return _COUNTRY_NAME_FIXES.get(x, x)
Code to make DataFrames for Choropleth map
# Country-code table, with its names rewritten to the spellings that
# clean_names maps them to.
codes_df = pd.read_csv('country_codes.csv')
codes_df['name'] = codes_df['name'].apply(clean_names)

# Per-country mean of every numeric column across all years.
# numeric_only=True skips the string column 'Regional indicator'; pandas >= 2.0
# raises a TypeError when asked to average it.
ag_countries = cmbd_df.groupby('Country name').mean(numeric_only=True)
ag_countries.reset_index(level=0, inplace=True)

# The outer join keeps countries that appear on only one side, so entries in
# the code table without survey data survive with missing values.
ag_countries = ag_countries.join(codes_df.set_index('name'), how='outer', on='Country name')

# Copy with every missing value replaced by 0 (feeds the gray base map layer).
no_nulls = ag_countries.fillna(0)
Segment of code used to help create "clean_names"
def find_name(x):
    """Report whether the hard-coded probe substring 'rit' occurs in ``x``."""
    return 'rit' in x
# Show the code-table rows whose name contains find_name's probe substring.
codes_df[codes_df['name'].apply(find_name)]
# Country outlines from the vega_datasets world map.
countries = alt.topo_feature(data.world_110m.url, 'countries')

# Colored layer: each shape is matched to ag_countries through 'id' and
# colored by that country's mean Life Ladder.
colors = (
    alt.Chart(countries, title='Life Ladder by Country')
    .mark_geoshape()
    .encode(
        color='Life Ladder:Q',
        tooltip=['Country name:N', 'Life Ladder:Q'],
    )
    .transform_lookup(
        lookup='id',
        from_=alt.LookupData(ag_countries, 'id', ['Country name', 'Life Ladder']),
    )
    .properties(width=500, height=300)
)

# Base layer: same shapes with a gray fill, looked up against the
# zero-filled frame.
gray = (
    alt.Chart(countries)
    .mark_geoshape(fill='gray')
    .encode(color='Life Ladder:Q')
    .transform_lookup(
        lookup='id',
        from_=alt.LookupData(no_nulls, 'id', ['Country name', 'Life Ladder']),
    )
    .properties(width=500, height=300)
)

# Draw the gray base first, then the colored layer on top of it.
gray + colors
After looking at global life ladder scores from around the globe through several perspectives, we decided to investigate what features correlate strongly to both a high and low life ladder, and to see if there are features which have minimal effect on life ladder score
# Pairwise correlation of the numeric columns. The string columns
# ('Country name', 'Regional indicator') are removed first because
# DataFrame.corr() raises on them in pandas >= 2.0; select_dtypes works on
# older pandas versions too.
corrmat = cmbd_df.select_dtypes(include='number').corr()
f, ax = plt.subplots()
sns.heatmap(corrmat, square=True)
# Diagnostic: rows whose index label repeats an earlier one.
cmbd_df[cmbd_df.index.duplicated()]
# Scatter-plot matrix over the frame's columns.
sns.pairplot(cmbd_df)
Graph GDP over time by Region
Social Support over time by Region
# Each graph_stat call below renders the region selector beside the yearly
# per-region mean of the named cmbd_df column.
graph_stat('Social support')
Life Expectancy over time by Region
graph_stat('Healthy life expectancy at birth')
Freedom over time by Region
graph_stat('Freedom to make life choices')
Generosity over time by Region
graph_stat('Generosity')
Graph Corruption over time by Region
graph_stat('Perceptions of corruption')
Now that we've looked at regional trends, let's focus on a few specific regions to try to understand what influences life ladder, and if there are different variables at play in different cultural regions
# One point per (country, year) for the 'North America and ANZ' rows.
alt.Chart(
    cmbd_df[cmbd_df['Regional indicator'] == 'North America and ANZ']
).mark_point().encode(
    x='year:O',
    y='Life Ladder:Q',
    color='Country name',
)
# Same view for the 'East Asia' rows.
alt.Chart(
    cmbd_df[cmbd_df['Regional indicator'] == 'East Asia']
).mark_point().encode(
    x='year:O',
    y='Life Ladder:Q',
    color='Country name',
)
One of our main goals is to better understand the role that wealth plays in a country's happiness level; the following sections therefore dive deep into the role, if any, that GDP plays in influencing the life ladder score.
# Region selector plus the yearly per-region mean of log GDP per capita.
graph_stat('Log GDP per capita')
Very long graph, but a good visual of Life Ladder vs. GDP. Life Ladder and GDP appear to have a positive relationship, but it is far from a direct correlation.
# Scatter of Life Ladder against log GDP per capita, one circle per row.
# Log GDP is a continuous measurement, so it is encoded as quantitative (:Q).
# As nominal (:N) every distinct value became its own category, which spaced
# the points by rank instead of by value and left the tick `values` below
# without any matching category. zero=False keeps the axis from stretching
# down to 0, where there is no data.
alt.Chart(cmbd_df, title='Life Ladder vs. GDP').mark_circle(size=100).encode(
    x=alt.X(
        'Log GDP per capita:Q',
        scale=alt.Scale(zero=False),
        axis=alt.Axis(values=list(range(6, 12))),
    ),
    y='Life Ladder:Q',
).properties(width=800, height=500)
When aggregated by country, the relationship between Life Ladder and GDP becomes much more clearly defined as a positive correlation.
# Scatter of each country's mean Life Ladder against its mean log GDP.
# mean_GDP is a continuous value, so it is encoded as quantitative (:Q); as
# nominal (:N) the points were spaced by rank instead of by value and the tick
# `values` matched no category. zero=False keeps the axis from stretching
# down to 0, where there is no data.
alt.Chart(
    cmbd_df, title='Life Ladder vs. GDP Aggregated by Country'
).mark_circle(size=200).encode(
    x=alt.X(
        'mean_GDP:Q',
        scale=alt.Scale(zero=False),
        axis=alt.Axis(values=list(range(6, 12))),
    ),
    y='mean_score:Q',
    color='Regional indicator:N',
    tooltip=['Country name', 'Regional indicator'],
).transform_aggregate(
    mean_score='mean(Life Ladder)',
    mean_GDP='mean(Log GDP per capita)',
    groupby=['Country name', 'Regional indicator'],
).properties(width=700, height=500)
Now that we've done some overall worldwide and regional analysis of the effect GDP has on life ladder score, we focus on the 5 poorest and 5 richest nations to draw solid conclusions about whether GDP plays a decisive role in influencing life ladder score.
# Keep only rows with a recorded, positive log GDP so that missing values do
# not skew the per-country averages (a NaN compares False against > 0, so
# those rows are dropped as well).
specific_cmbd_df = cmbd_df.loc[cmbd_df['Log GDP per capita'] > 0]
# numeric_only=True skips the string column 'Regional indicator'; pandas >= 2.0
# raises a TypeError when asked to average it.
average_scores_per_country = specific_cmbd_df.groupby('Country name').mean(numeric_only=True)
# Ascending sort: lowest mean log GDP first, highest last.
average_scores_per_country = average_scores_per_country.sort_values(by='Log GDP per capita')
average_scores_per_country
# average_scores_per_country is sorted by ascending log GDP, so the first
# five rows are the poorest countries and the last five the richest.
poorest_countries = average_scores_per_country.head(5)
richest_countries = average_scores_per_country.tail(5)
poorest_and_richest = pd.concat([poorest_countries, richest_countries])

# Flag each row by the slice it came from (0 = poorest, 1 = richest). The flag
# used to be derived from a hard-coded cut-off of 10 on log GDP, which only
# matched the two groups for as long as the data happened to straddle it.
wealth_indicator = [0] * len(poorest_countries) + [1] * len(richest_countries)
poorest_and_richest['Wealth Indicator'] = wealth_indicator
# Expose the index (the country names) as a regular column for charting.
poorest_and_richest['country name'] = poorest_and_richest.index
poorest_and_richest
# Life Ladder of the ten selected countries; sort='y' orders the bars by
# ascending Life Ladder, and the color carries the wealth flag.
(
    alt.Chart(poorest_and_richest)
    .mark_bar()
    .encode(
        x=alt.X('country name:N', sort='y'),
        y='Life Ladder',
        color='Wealth Indicator',
        tooltip=['country name:N', 'Life Ladder:Q'],
    )
)
Significant differences are present in life ladder score between the wealthy and the poor nations: the lowest-scoring wealthy nation is 2 full ladder steps above the highest-scoring poor nation.
# Life Ladder against log GDP for the same ten countries, colored by the
# wealth flag.
(
    alt.Chart(poorest_and_richest)
    .mark_point()
    .encode(
        x='Log GDP per capita:Q',
        y='Life Ladder:Q',
        color='Wealth Indicator',
        tooltip=['country name:N', 'Life Ladder:Q', 'Log GDP per capita:Q'],
    )
)
Very strong clustering is present between the wealthy and poor subsets; GDP can be assumed to play a decisive role in predicting whether a nation will have a high life ladder score.
After analyzing the dependency life ladder has on log GDP per capita, we decided to see if the results seen above will be replicated when the variable axes are switched, and the nations chosen are instead the 5 happiest and 5 least happy nations.
# Re-sort the per-country averages by ascending Life Ladder: the first five
# rows are the least happy countries and the last five the happiest.
sorted_by_happiness = average_scores_per_country.sort_values(by='Life Ladder')
least_happy_countries = sorted_by_happiness.head(5)
happiest_countries = sorted_by_happiness.tail(5)
happy_and_sad = pd.concat([least_happy_countries, happiest_countries])

# Flag each row by the slice it came from (0 = least happy, 1 = happiest).
# The flag used to be derived from a hard-coded cut-off of 7 on Life Ladder,
# which only matched the two groups for as long as the data straddled it.
happiness_indicator = [0] * len(least_happy_countries) + [1] * len(happiest_countries)
happy_and_sad['Happiness Indicator'] = happiness_indicator
# Expose the index (the country names) as a regular column for charting.
happy_and_sad['country name'] = happy_and_sad.index
happy_and_sad
# Log GDP of the ten selected countries; sort='y' orders the bars by
# ascending log GDP, and the color carries the happiness flag.
(
    alt.Chart(happy_and_sad)
    .mark_bar()
    .encode(
        x=alt.X('country name:N', sort='y'),
        y='Log GDP per capita',
        color='Happiness Indicator',
        tooltip=['country name:N', 'Log GDP per capita:Q'],
    )
)
The results here show even more significant differences than the previous bar chart; it appears that the two measures are interchangeable, and that life ladder and GDP are each powerful indicators of the other.
# Log GDP against Life Ladder for the same ten countries, colored by the
# happiness flag.
(
    alt.Chart(happy_and_sad)
    .mark_point()
    .encode(
        x='Life Ladder:Q',
        y='Log GDP per capita:Q',
        color='Happiness Indicator',
        tooltip=['country name:N', 'Life Ladder:Q', 'Log GDP per capita:Q'],
    )
)
The clustering present with log GDP as the dependent variable is even stronger than the reverse; we can decisively conclude that GDP and life ladder are highly effective features in predicting one another, and therefore share a strong relationship.
# Summary statistics for each of the four five-country groups, for
# side-by-side comparison.
poorest_countries.describe()
richest_countries.describe()
least_happy_countries.describe()
happiest_countries.describe()